cls
clear all
local dir "~/Dropbox/Working Papers/Distinction Effect"
cd "`dir'"

qui do "Does/Data Processing/0. Cleaning Programs.do"

/*******************************************************************************
					1. Saber 11 IDs
********************************************************************************/

use "Data/Originals/Saber11/Saber 11 1996 2014.dta", clear

keep icfes_snp-icfes_estudiante_genero

rename icfes_semestre sb11sem
rename icfes_ano sb11year
rename icfes_per sb11time
rename icfes_estudiante_documento_tipo national_id_type
rename icfes_estudiante_documento_numer national_id
rename icfes_estudiante_nombre sb11_name
rename icfes_fechanto birthdate
rename icfes_estudiante_edad age
rename icfes_estudiante_genero gender

drop if icfes_snp == ""
duplicates drop icfes_snp, force

bys national*: egen aux = min(sb11time)
keep if sb11time == aux
drop aux

destring national_id, gen(aux) force
drop if aux == .
drop if aux <= 9999
drop aux

spell sb11_name, clean(sb11_name) up(sb11_name)

replace national_id_type = upper(national_id_type)

duplicates drop national*, force
save "Data/Originals/Saber11/Saber 11 IDs", replace

/*******************************************************************************
				2. Merge Saber Pro + Saber 11
********************************************************************************/

use "Data/Intermediates/SaberPro_2006-2010.dta", clear

*sample 20000, count

gen spro_name = spro_surname1 + " " + spro_surname2 + " " + spro_name1 + " " + spro_name2
spell spro_name, clean(spro_name) up(spro_name)

keep consecutivo national* spro_name spro_birthdate spro_gender spro_sb11* exam_year

foreach var of varlist _all {
    if regexm("`var'", "spro_") & "`var'" != "spro_name" {
	    local newvar = subinstr("`var'", "spro_", "", .)
		rename `var' `newvar'
	}
}

replace sb11year = . if exam_year < sb11year
tostring sb11year, replace
replace sb11year = "" if sb11year == "."

gen age = .
replace age = (date("01-04-"+sb11year, "DMY") - birthdate)/365.25 if sb11sem  == 1
replace age = (date("01-09-"+sb11year, "DMY") - birthdate)/365.25 if sb11sem  == 2
replace age = floor(age)
replace age = . if age <= 13 | age >= 36
destring sb11year, replace

duplicates drop national*, force

save "Data/Intermediates/SaberPro_IDs", replace

/*******************************************************************************
2.1 Merge Using IDs + Cleaning using Name, Birthdate and Gender
********************************************************************************/
//IDs from both datasets
use "Data/Intermediates/SaberPro_IDs", clear
rename birthdate spro_birthdate
rename age spro_age
rename gender spro_gender
	
merge 1:m national* using "Data/Originals/Saber11/Saber 11 IDs"
keep if _merge == 3
drop _merge

matchit spro_name sb11_name, sim(ngram,2) generate(phone_score)
	
gen aux = 1 if phone_score < 0.6
replace aux = 1 if phone_score > 0.6 & phone_score <= 0.7 /// 
& spro_birthdate == birthdate & birthdate != . ///
& spro_gender == gender & gender != .
drop if aux == 1
drop aux
	
keep national* icfes_snp
save "Data/Intermediates/SaberProSaber11", replace

//IDs from resgistraduria	
use "Data/Intermediates/SaberPro_IDs", clear
merge 1:1 national* using "Data/Intermediates/SaberProSaber11", keepusing(national*)
keep if _merge == 1
drop _merge

rename birthdate spro_birthdate
rename age spro_age
rename gender spro_gender

keep if national_id_type == "C"
destring national_id, replace force
drop if national_id == .
duplicates drop national_id, force
merge 1:1 national_id using "Data/Originals/Registraduria/Registraduria_CC.dta"
keep if _merge == 3
drop _merge
	
rename national_id spro_id
rename national_id_type spro_idtype
rename reg_national_id_ti national_id
gen national_id_type = "T"
	
keep if national_id != .

gen reg_name = reg_apellido1 + " " + reg_apellido2 + " " + reg_nombre1 + " " + reg_nombre2
spell reg_name, clean(reg_name) up(reg_name)
	
matchit spro_name reg_name, sim(ngram,2) generate(phone_score)
	
gen aux = 1 if phone_score < 0.6
replace aux = 1 if phone_score > 0.6 & phone_score <= 0.7 /// 
& spro_birthdate == reg_birth_date & reg_birth_date != .
drop if aux == 1
drop aux phone_score
	
tostring national_id, replace format(%20.0g)
merge 1:1 national* using "Data/Originals/Saber11/Saber 11 IDs"
keep if _merge == 3
drop _merge

matchit spro_name sb11_name, sim(ngram,2) generate(phone_score)	

gen aux = 1 if phone_score < 0.6
replace aux = 1 if phone_score > 0.6 & phone_score <= 0.7 /// 
& spro_birthdate == birthdate & birthdate != . ///
& spro_gender == gender & gender != .
drop if aux == 1
drop aux
	
drop national*
rename spro_id national_id
rename spro_idtype national_id_type
	
tostring national_id, replace format(%20.0g)

keep national* icfes_snp
append using "Data/Intermediates/SaberProSaber11"
save "Data/Intermediates/SaberProSaber11", replace

/*******************************************************************************
2.2 Merge Using Name + Birthdate + Gender + Saber 11 Year
********************************************************************************/
use "Data/Intermediates/SaberPro_IDs", clear
merge 1:1 national* using "Data/Intermediates/SaberProSaber11", keepusing(national*)
keep if _merge == 1
drop _merge

rename gender aux
gen gender = aux == 0 if  aux != .
drop aux

drop if birthdate == .
drop if gender == .
drop if sb11year == .
joinby birthdate gender sb11year using "Data/Originals/Saber11/Saber 11 IDs"

matchit spro_name sb11_name, sim(ngram,2) generate(phone_score)	
	
gen aux = 1 if phone_score < 0.8
drop if aux == 1
drop aux
	
bys national* : egen aux = max(phone_score)
keep if aux == phone_score
drop aux
	
bys national* : egen aux1 = seq()
bys national* : egen aux2 = max(aux1)
drop if aux2 > 1
drop aux?
	
keep national* icfes_snp
append using "Data/Intermediates/SaberProSaber11"
save "Data/Intermediates/SaberProSaber11", replace

/*******************************************************************************
2.3 Merge Using Name + Age by Saber 11 + Gender + Saber 11 Year
********************************************************************************/
use "Data/Intermediates/SaberPro_IDs", clear
merge 1:1 national* using "Data/Intermediates/SaberProSaber11", keepusing(national*)
keep if _merge == 1
drop _merge

drop if age == .
drop if sb11year == .
rename spro_name sb11_name
joinby sb11_name age sb11year using "Data/Originals/Saber11/Saber 11 IDs"
	
keep national* icfes_snp
	
bys national* : egen aux1 = seq()
bys national* : egen aux2 = max(aux1)
drop if aux2 > 1
drop aux?
	
append using "Data/Intermediates/SaberProSaber11"
save "Data/Intermediates/SaberProSaber11", replace

/*******************************************************************************
2.4 Append Merge by MEN + Cleaning 2.1 to 2.3 
********************************************************************************/
use "Data/Intermediates/SaberPro_IDs", clear

merge 1:1 national* using "Data/Intermediates/SaberProSaber11"
keep if _merge == 3 | _merge == 1
drop _merge

duplicates tag icfes_snp, gen(dup)
replace icfes_snp = "" if dup == 1 | dup == 2
drop dup

//Include IDs from Merge Process by MEN
drop if consecutivo == ""
rename icfes_snp aux
merge 1:1 consecutivo using "Data/Intermediates/SaberProSaber11_MEN.dta", keepusing(icfes_snp)
keep if _merge == 3 | _merge == 1
drop _merge

replace aux = icfes_snp if aux == ""
drop icfes_snp
rename aux icfes_snp

duplicates tag icfes_snp, gen(dup)
replace icfes_snp = "" if dup == 1
drop dup

tempfile SaberProSaber11_Complete
save `SaberProSaber11_Complete', replace

//Final Cleaning
merge m:1 icfes_snp using "Data/Originals/Saber11/Saber 11 IDs", keepusing(sb11_name)
keep if _merge == 3
drop _merge

matchit spro_name sb11_name, sim(ngram,2) generate(phone_score1)	
matchit spro_name sb11_name, sim(token_soundex) generate(phone_score2)	
drop sb11_name

replace icfes_snp = "" if phone_score1 <= 0.68
replace icfes_snp = "" if phone_score1 <= 0.75 & phone_score2 >= 0.34  & phone_score2 <= 0.7
drop if icfes_snp == "" //889 observations
drop phone_score*

//Append Data Dropped "Final Cleaning"
merge 1:1 consecutivo using `SaberProSaber11_Complete'
drop _merge

count
duplicates report consecutivo

keep consecutivo national* icfes_snp
save "Data/Intermediates/SaberProSaber11", replace

/*******************************************************************************
2.5 Merge Using Birthdate and Name (Perfect Match)
********************************************************************************/
use "Data/Intermediates/SaberPro_IDs", clear

merge 1:1 national* using "Data/Intermediates/SaberProSaber11"
keep if _merge == 3 | _merge == 1
drop _merge

merge m:1 icfes_snp using "Data/Originals/Saber11/Saber 11 IDs"
keep if _merge == 1
drop _merge

drop if birthdate == .
drop if gender == .
drop if spro_name == ""
keep consecutivo national_id national_id_type spro_name birthdate gender

duplicates tag spro_name birthdate, gen(dup)
drop if dup > 0
drop dup

rename spro_name sb11_name
joinby sb11_name birthdate using "Data/Originals/Saber11/Saber 11 IDs"

bys national* : egen min = min(sb11time)
keep if sb11time == min
drop min

duplicates tag national*, gen(dup)
drop if dup > 0
drop dup

keep consecutivo national* icfes_snp

tempfile Merge
save `Merge', replace

use "Data/Intermediates/SaberProSaber11", clear

rename icfes_snp aux 
merge 1:1 consecutivo using `Merge'
replace aux = icfes_snp if _merge == 3
drop icfes_snp _merge
rename aux icfes_snp

save "Data/Intermediates/SaberProSaber11", replace

/*******************************************************************************
2.6 Merge Using Names (Perfct Match)
********************************************************************************/
use "Data/Intermediates/SaberPro_IDs", clear

merge 1:1 national* using "Data/Intermediates/SaberProSaber11"
keep if _merge == 3 | _merge == 1
drop _merge

merge m:1 icfes_snp using "Data/Originals/Saber11/Saber 11 IDs"
keep if _merge == 1
drop _merge

drop if spro_name == ""
keep consecutivo national_id national_id_type spro_name

duplicates tag spro_name, gen(dup)
drop if dup > 0
drop dup

rename spro_name sb11_name
joinby sb11_name using "Data/Originals/Saber11/Saber 11 IDs"

bys national* : egen min = min(sb11time)
keep if sb11time == min
drop min

duplicates tag national*, gen(dup)
drop if dup > 0
drop dup

keep consecutivo national* icfes_snp

tempfile Merge
save `Merge', replace

use "Data/Intermediates/SaberProSaber11", clear

rename icfes_snp aux 
merge 1:1 consecutivo using `Merge'
replace aux = icfes_snp if _merge == 3
drop icfes_snp _merge
rename aux icfes_snp

duplicates tag icfes_snp, gen(dup)
replace icfes_snp = "" if dup > 0
drop dup

save "Data/Intermediates/SaberProSaber11", replace

/*******************************************************************************
2.7 Merge Using Names and Birthdates
********************************************************************************/
use "Data/Intermediates/SaberPro_IDs", clear

merge 1:1 national* using "Data/Intermediates/SaberProSaber11"
keep if _merge == 3 | _merge == 1
drop _merge

merge m:1 icfes_snp using "Data/Originals/Saber11/Saber 11 IDs"
keep if _merge == 1
drop _merge

rename gender spro_gender
drop if spro_gender == .
drop if birthdate == .
drop if spro_name == ""
keep consecutivo national_id national_id_type spro_name spro_gender birthdate 

joinby birthdate using "Data/Originals/Saber11/Saber 11 IDs"

matchit spro_name sb11_name, sim(ngram,2) generate(phone_score)
drop if phone_score <= 0.8
bys national* : egen double max = max(phone_score)
keep if phone_score == max
drop max

bys national* : egen double min = min(sb11time)
keep if sb11time == min
drop min

duplicates tag national*, gen(dup)
drop if dup > 0
drop dup

rename gender aux 
gen gender = aux == 0 if aux != .
drop aux
drop if phone_score <= 0.9 & spro_gender != gender & gender != .

tempfile Merge
save `Merge', replace

use "Data/Intermediates/SaberProSaber11", clear

rename icfes_snp aux 
merge 1:1 consecutivo using `Merge', keepusing(icfes_snp)
replace aux = icfes_snp if _merge == 3
drop icfes_snp _merge
rename aux icfes_snp

duplicates tag icfes_snp, gen(dup)
replace icfes_snp = "" if dup > 0
drop dup

save "Data/Intermediates/SaberProSaber11", replace


/***************************************************************
3. Merge Here Information from Saber 11 Universe
***************************************************************/
use "Data/Originals/Saber11/Saber 11 1996 2014.dta", clear

rename icfes_semestre sb11_sem
rename icfes_ano sb11_year
rename icfes_per sb11_time
rename icfes_estudiante_documento_tipo sb11_nationalid_type
rename icfes_estudiante_documento_numer sb11_nationalid
rename icfes_estudiante_nombre sb11_name
rename icfes_estudiante_genero sb11_gender
rename icfes_fechanto sb11_birthdate
rename icfes_estudiante_edad sb11_age
rename icfes_municipio sb11_municipality
rename icfes_estudiante_colegio_codigo sb11_schid_icfes
rename icfes_codigo_dane sb11_schid_dane
rename icfes_estudiante_colegio_jornada sb11_schtime
rename icfes_estudiante_trabaja sb11_work
rename icfes_puntaje_matematicas sb11_math_rawscore
rename icfes_puntaje_lenguaje sb11_reading_rawscore
rename icfes_puntaje_biologia sb11_biology_rawscore
rename icfes_puntaje_fisica sb11_physics_rawscore
rename icfes_puntaje_quimica sb11_chemistry_rawscore
rename icfes_puntaje_geografia sb11_geography_rawscore
rename icfes_puntaje_historia sb11_history_rawscore
rename icfes_puntaje_filosofia sb11_philosophy_rawscore
rename icfes_puntaje_idioma sb11_frglanguage_rawscore
rename icfes_idioma sb11_frglanguage_id
rename icfes_puntaje_total sb11_overall_pctile

replace sb11_reading_rawscore = icfes_lectura_critica if sb11_reading_rawscore == .
egen double sb11_socialsc_rawscore = rowmean(sb11_geography sb11_history icfes_puntaje_sociales icfes_sociales_ciudadanas)
replace sb11_socialsc_rawscore = round(sb11_socialsc, 0.01)
egen double sb11_naturalsc_rawscore = rowmean(sb11_biology sb11_physics sb11_chemistry icfes_ciencias_naturales)
replace sb11_naturalsc_rawscore = round(sb11_natural, 0.01)
egen double sb11_overall_rawscore = rowmean(sb11_math sb11_reading sb11_socialsc sb11_naturalsc sb11_frglanguage_raw)
replace sb11_overall_rawscore = round(sb11_overall_raw, 0.01)

keep icfes_snp sb11*

//Standardize Scores
foreach score in math reading socialsc naturalsc frglanguage overall {
    replace sb11_`score'_rawscore = . if sb11_`score'_rawscore <= 0
	bys sb11_time : egen double aux_mean = mean(sb11_`score'_rawscore)
    bys sb11_time : egen double aux_sd = sd(sb11_`score'_rawscore)
	gen sb11_`score'_sd = (sb11_`score'_rawscore - aux_mean)/aux_sd
	drop aux*
}

drop if icfes_snp == ""
bys icfes_snp : egen min = min(sb11_time)
keep if sb11_time == min
drop min

duplicates drop icfes_snp, force 

merge 1:m icfes_snp using "Data/Intermediates/SaberProSaber11", keepusing(consecutivo national*)
keep if _merge == 3
drop _merge

save "Data/Intermediates/SaberProSaber11_Complete", replace
